This document provides an analysis of the Heart Failure Dataset, focusing on descriptive statistics, survival outcome analysis, and interactive visualizations.
# Load the packages this analysis relies on (`%>%`/`select()` from dplyr,
# `ggplot()` from ggplot2). Attaching an already-attached package is a no-op.
library(dplyr)
library(ggplot2)
# Load the dataset
data <- read.csv("~/Heart_Failure_Final.csv")
# Remove unnecessary columns.
# The file carries placeholder columns (`...13` to `...17`) that hold nothing
# but NA. None of them start with "Unnamed", so a name-based filter on that
# prefix removes nothing; drop every column that is entirely NA instead.
# NOTE(review): `X` looks like a row index written by an earlier export --
# confirm, and drop it here as well if it is not needed downstream.
data <- data %>%
  select(!where(function(col) all(is.na(col))))
# Display the first few rows
head(data)
## X age anaemia creatinine_phosphokinase diabetes ejection_fraction
## 1 1 75 No 582 No 20
## 2 2 55 No 7861 No 38
## 3 3 65 No 146 No 20
## 4 4 50 Yes 111 No 20
## 5 5 65 Yes 160 Yes 20
## 6 6 90 Yes 47 No 40
## high_blood_pressure platelets serum_creatinine serum_sodium sex smoking
## 1 Yes 265000 1.9 130 Male No
## 2 No 263358 1.1 136 Male No
## 3 No 162000 1.3 129 Male Yes
## 4 No 210000 1.9 137 Male No
## 5 No 327000 2.7 116 Female No
## 6 Yes 204000 2.1 132 Male Yes
## DEATH_EVENT
## 1 Deceased
## 2 Deceased
## 3 Deceased
## 4 Deceased
## 5 Deceased
## 6 Deceased
# Per-column summary: five-number summaries for numeric columns, length and
# class for the character (categorical) columns.
summary(data)
## X age anaemia creatinine_phosphokinase
## Min. : 1.0 Min. :40.00 Length:299 Min. : 23.0
## 1st Qu.: 75.5 1st Qu.:51.00 Class :character 1st Qu.: 116.5
## Median :150.0 Median :60.00 Mode :character Median : 250.0
## Mean :150.0 Mean :60.83 Mean : 581.8
## 3rd Qu.:224.5 3rd Qu.:70.00 3rd Qu.: 582.0
## Max. :299.0 Max. :95.00 Max. :7861.0
## diabetes ejection_fraction high_blood_pressure platelets
## Length:299 Min. :14.00 Length:299 Min. : 25100
## Class :character 1st Qu.:30.00 Class :character 1st Qu.:212500
## Mode :character Median :38.00 Mode :character Median :262000
## Mean :38.08 Mean :263358
## 3rd Qu.:45.00 3rd Qu.:303500
## Max. :80.00 Max. :850000
## serum_creatinine serum_sodium sex smoking
## Min. :0.500 Min. :113.0 Length:299 Length:299
## 1st Qu.:0.900 1st Qu.:134.0 Class :character Class :character
## Median :1.100 Median :137.0 Mode :character Mode :character
## Mean :1.394 Mean :136.6
## 3rd Qu.:1.400 3rd Qu.:140.0
## Max. :9.400 Max. :148.0
## DEATH_EVENT
## Length:299
## Class :character
## Mode :character
##
##
##
# Age distribution: histogram of `age` in 5-year-wide bins.
p <- ggplot(data = data, mapping = aes(x = age)) +
  geom_histogram(
    binwidth = 5,
    colour = "black",
    fill = "steelblue"
  ) +
  ggtitle("Age Distribution") +
  xlab("Age") +
  ylab("Count") +
  theme_minimal()
# Auto-print the plot object so it is rendered in the report.
p
# Death event by gender: bars stacked to 100% (`position = "fill"`), so each
# bar shows the share of each outcome within that gender.
#
# `sex` and `DEATH_EVENT` are character columns that already hold text labels
# ("Female"/"Male", "Deceased"/...). Relabelling them by position with
# `factor(x, labels = ...)` follows the alphabetical order of the values, so
# the outcome labels end up swapped whenever the non-"Deceased" value sorts
# after "Deceased" (e.g. "Survived"). Map the values explicitly instead.
# NOTE(review): assumes every value other than "Deceased" means the patient
# survived -- confirm against the data dictionary.
gender_plot <- ggplot(
  data,
  aes(
    x = factor(sex, levels = c("Female", "Male")),
    fill = factor(
      DEATH_EVENT == "Deceased",
      levels = c(FALSE, TRUE),
      labels = c("Survived", "Died")
    )
  )
) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    title = "Death Event by Gender",
    x = "Gender",
    y = "Proportion",
    fill = "Outcome"
  )
gender_plot
# Serum sodium by survival outcome: one box plot per outcome group.
#
# `DEATH_EVENT` is a character column that already holds text labels
# ("Deceased"/...). Relabelling it by position with
# `factor(x, labels = ...)` follows the alphabetical order of the values, so
# the two groups end up swapped whenever the non-"Deceased" value sorts after
# "Deceased" (e.g. "Survived"). Map the values explicitly instead.
# NOTE(review): assumes every value other than "Deceased" means the patient
# survived -- confirm against the data dictionary.
sodium_plot <- ggplot(
  data,
  aes(
    x = factor(
      DEATH_EVENT == "Deceased",
      levels = c(FALSE, TRUE),
      labels = c("Survived", "Died")
    ),
    y = serum_sodium
  )
) +
  geom_boxplot(fill = "lightgreen") +
  theme_minimal() +
  labs(
    title = "Serum Sodium Levels by Survival Outcome",
    x = "Outcome",
    y = "Serum Sodium"
  )
sodium_plot
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
This analysis provides insights into the key features of the Heart Failure Dataset and their relationships to survival outcomes. Further analysis could include predictive modeling or more detailed survival analysis.